INITIAL IMPORTS¶

In [1]:
# Standard library
import json
import warnings

# Third-party
import numpy as np
import pandas as pd

# Display settings: show up to 200 rows and never truncate columns.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")


# Load the base dataset.
df_base = pd.read_csv('data/Base.csv')

# Load the feature-type lookup, keyed by type name (e.g. 'nominal_binary').
with open('data_types.json') as f:
    data_types = json.loads(f.read())

# Replace the string column `source` ({'INTERNET', 'TELEAPP'}) with a 0/1 flag:
# pop() removes `source` from the frame and the flag is appended as the last column.
df_base['source_is_internet_not_teleapp'] = df_base.pop('source').eq('INTERNET').astype(int)


# Columns in which a missing value is recorded as -1.
missing_values = [
    'prev_address_months_count',
    'current_address_months_count',
    'bank_months_count',
    'session_length_in_minutes',
    'device_distinct_emails_8w',
]
In [2]:
import importlib

import EDA

# Reload EDA so that edits to the module are picked up without restarting the
# kernel, then rebind the helper names to the freshly loaded definitions.
importlib.reload(EDA)
FeatureSignificance, FeatureVisualisation, OddsRatios = (
    EDA.FeatureSignificance,
    EDA.FeatureVisualisation,
    EDA.OddsRatios,
)

DATA VISUALISATION¶

In [3]:
# Visualisation helper built on the full frame with 'fraud_bool' as the target.
# info() lists which chart types are available for each feature group.
fv = FeatureVisualisation(df_base, target='fraud_bool')
fv.info()
ordinal                                            [Bar charts]
numerical                 [Density plots, Boxplots, Bar charts]
nominal_multi_category         [Bar charts, Stacked bar charts]
nominal_binary                                     [Bar charts]
missing value analysis                             [Bar charts]
dtype: object
In [4]:
# Bar charts for the binary nominal features.
binary_plot_cols = data_types['nominal_binary']
fv.nominal_binary(binary_plot_cols)
In [5]:
# Bar charts and stacked bar charts for the multi-category nominal features.
multi_category_plot_cols = data_types['nominal_multi_category']
fv.nominal_multi_category(multi_category_plot_cols)
In [6]:
# Plot the continuous (bounded and unbounded) and discrete numerical features together.
numerical_plot_cols = [
    *data_types['numerical_continuous_bounded'],
    *data_types['numerical_continuous_unbounded'],
    *data_types['numerical_discrete'],
]
fv.numerical(numerical_plot_cols)
In [7]:
# Temporal features are plotted alongside the ordinal ones.
ordinal_plot_cols = [*data_types['ordinal'], *data_types['temporal']]
fv.ordinal(ordinal_plot_cols)
In [8]:
# Missing-value analysis for the columns that use -1 as their missing marker.
fv.missing_values(
    missing_value_type=-1,
    variables=missing_values,
)

ODDS RATIOS¶

In [9]:
# Odds-ratio helper for the 'fraud_bool' target. missing_values names the columns
# whose missing entries are recorded as -1.
# NOTE(review): how OddsRatios treats those columns is not visible here — confirm in EDA.
odds = OddsRatios(df_base, target='fraud_bool', missing_values=missing_values)
In [10]:
# One odds ratio per binary nominal feature.
binary_odds_cols = data_types['nominal_binary']
odds.binary_features(binary_odds_cols)
Out[10]:
Odds Ratio
has_other_cards 0.318417
keep_alive_session 0.376444
phone_home_valid 0.469967
source_is_internet_not_teleapp 0.688442
phone_mobile_valid 0.702744
email_is_free 1.740679
foreign_request 2.069746
In [11]:
# One odds ratio per category of each multi-category nominal feature.
multi_category_odds_cols = data_types['nominal_multi_category']
odds.multi_category_features(multi_category_odds_cols)
Out[11]:
Category Odds Ratio
Variable
device_distinct_emails_8w 1 0.265266
device_distinct_emails_8w -1 1.010371
device_distinct_emails_8w 0 2.228911
device_distinct_emails_8w 2 4.116982
device_os linux 0.366027
device_os other 0.414704
device_os x11 1.016389
device_os macintosh 1.290321
device_os windows 4.098390
employment_status CF 0.167007
employment_status CE 0.206102
employment_status CD 0.333332
employment_status CB 0.586513
employment_status CG 1.407636
employment_status CA 1.549991
employment_status CC 2.386559
housing_status BE 0.271163
housing_status BG 0.357193
housing_status BF 0.377276
housing_status BC 0.438163
housing_status BB 0.466137
housing_status BD 0.776820
housing_status BA 6.878803
payment_type AE 0.311292
payment_type AA 0.402194
payment_type AD 0.978474
payment_type AB 1.032689
payment_type AC 1.845299
proposed_credit_limit 210.0 0.147893
proposed_credit_limit 510.0 0.412056
proposed_credit_limit 200.0 0.417136
proposed_credit_limit 190.0 0.553478
proposed_credit_limit 500.0 0.969119
proposed_credit_limit 1000.0 1.001126
proposed_credit_limit 490.0 1.254403
proposed_credit_limit 990.0 1.646218
proposed_credit_limit 1500.0 2.408798
proposed_credit_limit 2100.0 12.459369
proposed_credit_limit 2000.0 14.321557
proposed_credit_limit 1900.0 23.302432
In [12]:
# Odds ratios per quantile bucket (1/10 ... 10/10) of each numerical feature.
numerical_odds_cols = data_types['numerical']
odds.numerical_features(numerical_odds_cols)
Out[12]:
Odds Ratio: name_email_similarity Odds Ratio: prev_address_months_count Odds Ratio: current_address_months_count Odds Ratio: days_since_request Odds Ratio: intended_balcon_amount Odds Ratio: zip_count_4w Odds Ratio: velocity_6h Odds Ratio: velocity_24h Odds Ratio: velocity_4w Odds Ratio: bank_branch_count_8w Odds Ratio: date_of_birth_distinct_emails_4w Odds Ratio: credit_risk_score Odds Ratio: bank_months_count Odds Ratio: session_length_in_minutes
Quantile
1/10 2.076870 0.659584 0.307934 1.267410 0.930345 0.900108 1.374300 1.347902 1.340229 NaN 3.062207 0.458805 NaN 0.906973
2/10 1.843790 0.909963 0.267626 1.226598 1.112497 0.969685 1.318376 1.176602 1.459979 1.810370 1.136016 0.599497 NaN 0.895999
3/10 1.258789 0.435000 0.348838 1.129227 1.210584 1.012211 0.959534 0.760031 0.965596 1.593486 1.068145 0.608259 0.746756 1.045007
4/10 0.985862 0.861667 0.528918 1.032816 1.205259 0.862397 0.942401 0.850785 0.792996 0.915409 0.976218 0.543386 0.958241 1.067742
5/10 0.614654 0.975363 0.882163 1.090630 1.193563 0.880684 1.067831 1.214849 1.125039 0.624212 1.135476 0.536810 0.748132 1.080184
6/10 0.602521 0.695424 1.595050 0.992976 1.401952 0.985928 1.017453 1.205259 0.946426 0.850675 1.019438 0.585681 0.870725 1.043977
7/10 0.656914 0.837131 1.902022 0.880456 1.403061 1.065372 0.924330 0.887408 0.776483 0.918453 0.644697 0.895755 0.689156 1.091619
8/10 0.895365 0.947552 1.985497 0.747490 0.788133 1.154645 0.922326 1.047202 0.853743 0.683165 0.596807 1.161041 1.213195 0.932036
9/10 0.734985 1.706445 1.297652 0.534098 0.412015 1.119696 0.881449 0.835040 0.727308 0.620117 0.498984 1.467943 0.961991 0.735338
10/10 0.596009 2.054606 1.295635 1.150237 0.469461 1.058216 0.636210 0.722524 1.069912 0.703369 0.546728 4.025545 2.064426 1.219279

STATS SCORES¶

In [13]:
# Significance-testing helper for the 'fraud_bool' target; info() lists the
# statistics reported for each feature group.
# NOTE(review): unlike the OddsRatios call, no missing_values list is passed here, so
# the -1 missing markers presumably enter the correlations/tests as real values — confirm in EDA.
fs = FeatureSignificance(df_base, target='fraud_bool')
fs.info()
temporal                        [Spearman Correlation, Chi-Square Test (Goodness of Fit)]
ordinal                           [Spearman Correlation, Logistic Regression Coefficient]
numerical_discrete                                   [Point-Biserial Correlation, T-Test]
numerical_continuous                             [Spearman Correlation, T-Test, K-S Test]
nominal_multi_category        [Chi-Square Test of Independence (Contingency), Cramér’s V]
nominal_binary            [Chi-Square Test of Independence (Contingency), Point-Biserial]
dtype: object
In [14]:
# Correlations across the numerical features.
correlation_cols = data_types['numerical']
fs.correlations(correlation_cols)
In [15]:
# Point-biserial correlation and t-test for each discrete numerical feature.
discrete_cols = data_types['numerical_discrete']
fs.numerical_discrete(discrete_cols)
Out[15]:
Point-Biserial Correlation p-value t_stat t_stat p_value
Feature
credit_risk_score 0.070588 0.000000e+00 -60.091371 0.000000e+00
current_address_months_count 0.033480 7.586132e-245 -34.620596 1.082672e-249
bank_months_count 0.020929 4.312880e-73 -17.692433 1.458684e-68
prev_address_months_count 0.020083 5.238191e-27 -8.357725 2.417317e-16
zip_count_4w 0.005212 1.868284e-07 -5.210355 1.918071e-07
bank_branch_count_8w -0.011577 5.397590e-31 12.764196 4.674261e-37
date_of_birth_distinct_emails_4w -0.043224 0.000000e+00 44.852802 0.000000e+00
In [16]:
# Spearman correlation, t-test and K-S test for the bounded and unbounded continuous features.
continuous_cols = [
    *data_types['numerical_continuous_bounded'],
    *data_types['numerical_continuous_unbounded'],
]
fs.numerical_continuous(continuous_cols)
Out[16]:
Spearman Correlation Spearman p-value T-Statistic T-Test p-value K-S Statistic K-S p-value
Feature
session_length_in_minutes 0.002058 3.981677e-02 -7.450250 1.000315e-13 0.027061 2.318452e-07
velocity_24h -0.010509 7.791899e-26 11.511446 1.709035e-30 0.047764 4.602355e-22
velocity_4w -0.013524 1.118804e-41 10.882405 1.915534e-27 0.071602 4.519496e-49
days_since_request -0.014209 7.956264e-46 -0.535197 5.925240e-01 0.069237 6.583571e-46
velocity_6h -0.016497 3.816254e-61 17.508297 9.682576e-68 0.063211 2.493909e-38
intended_balcon_amount -0.017954 4.326131e-72 29.776915 1.070649e-187 0.143678 6.122341e-197
name_email_similarity -0.037283 1.936639e-304 35.923329 1.167023e-267 0.185558 0.000000e+00
In [17]:
# Chi-square test of independence and Cramér's V for each multi-category nominal feature.
multi_category_test_cols = data_types['nominal_multi_category']
fs.nominal_multi_category(multi_category_test_cols)
Out[17]:
Chi Square Statistic p-value (Chi-Square) cramers v
Feature
housing_status 13202.787719 0.0 0.114903
proposed_credit_limit 11738.244603 0.0 0.108343
device_os 6478.945928 0.0 0.080492
device_distinct_emails_8w 2234.990477 0.0 0.047276
employment_status 1572.499001 0.0 0.039655
payment_type 1528.342010 0.0 0.039094
In [18]:
# Chi-square test of independence and point-biserial correlation for each binary feature.
binary_test_cols = data_types['nominal_binary']
fs.nominal_binary(binary_test_cols)
Out[18]:
Chi Square Statistic p-value (Chi-Square) Point-Biserial Correlation p-value (Point-Biserial)
Feature
email_is_free 769.952405 1.841219e-169 0.027758 1.216719e-169
foreign_request 284.060841 9.787932e-64 0.016885 5.722730e-64
source_is_internet_not_teleapp 14.937804 1.111140e-04 -0.003922 8.774983e-05
phone_mobile_valid 173.301368 1.406547e-39 -0.013180 1.140352e-39
phone_home_valid 1233.281326 3.567963e-270 -0.035128 1.734899e-270
has_other_cards 1235.161652 1.392459e-270 -0.035156 6.347503e-271
keep_alive_session 2528.754445 0.000000e+00 -0.050296 0.000000e+00
In [19]:
# Spearman correlation and logistic-regression coefficient for each ordinal feature.
ordinal_test_cols = data_types['ordinal']
fs.ordinal(ordinal_test_cols)
Out[19]:
Spearman Correlation Spearman p-value Log Regression coef
Feature
income 0.049583 0.0 1.711414
customer_age 0.058146 0.0 0.045050
In [20]:
# Spearman correlation and chi-square goodness-of-fit test for each temporal feature.
temporal_test_cols = data_types['temporal']
fs.temporal(temporal_test_cols)
Out[20]:
Spearman Correlation p-value (Spearman) Chi-Square Statistic p-value (Chi-Square)
Feature
month 0.012949 2.362542e-38 330.160493 2.166747e-67

KEY TAKEAWAYS¶

Thoughts:

  • foreign request much more likely to be fraud
  • has other cards much less likely to be fraud
  • keep_alive_session is a big indicator: it makes fraud much less likely
  • differences in employment_status result in very different fraud percentages: CC very high, CF very low
  • BA housing_status is massive indicator of fraud
  • device_os possibly big indicator comparing windows to linux/other
  • device_distinct_emails_8w being greater than 1 causes a big increase in fraud percentage
  • the high proposed_credit_limit sample is very small but fraud goes up a lot; is there a significant relationship with fraud?
  • email_is_free shows quite a big difference and quite balanced population in dataset

  • current address months count missing is big fraud indicator, or when it's high

  • previous address months count is big factor, when missing fraud is much higher

  • date_of_birth_distinct_emails_4w is big fraud indicator when LOW

  • credit_risk_score big indicator for fraud when HIGH

  • high income and high customer age seem to correlate with higher rates of fraud?

  • name_email_similarity is a big indicator too?

  • velocity_6h relationship with fraud is quite linear

clear non linear divisions in data:¶

  • FOREIGN REQUEST == 1

  • KEEP_ALIVE_SESSION == 1

  • HOUSING_STATUS is BA

  • EMPLOYMENT_STATUS shows big fluctuations (CC very high / CF very low)

  • PREVIOUS_ADDRESS_MONTHS_COUNT - Missing value

clear directional trends in data:¶

  • NAME_EMAIL_SIMILARITY negative correlation with fraud

  • CREDIT_RISK_SCORE positive correlation with fraud

  • DATE_OF_BIRTH_DISTINCT_EMAILS_4W negative correlation with fraud